library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(patchwork)
library(treemapify)
library(reshape2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x plyr::arrange() masks dplyr::arrange()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x plyr::mutate() masks dplyr::mutate()
## x plyr::rename() masks dplyr::rename()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
library(plotly)
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
## The following object is masked from 'package:plyr':
##
## ozone
# Load the US cause-of-death table (semicolon-separated) and drop the
# "All causes" aggregate rows so that only individual causes remain.
usa_deaths_states <- read.csv(
  "data/death-causes-usa.csv",
  sep = ";"
) %>%
  filter(Cause.Name != "All causes")

# Load the Spanish 2017 table; it is read as UTF-8-BOM (presumably the file
# carries a byte-order mark -- confirm against the data file).
spain_deaths <- read.csv(
  "data/death-causes-spain-2017-modified.csv",
  sep = ";",
  fileEncoding = "UTF-8-BOM"
)
USA data set
The data published by the Centers for Disease Control and Prevention was gathered by the National Center for Health Statistics (NCHS), with the last revision being made in 2017. This data set contains information on the 10 leading causes of death in the United States. The data is based on information from resident death certificates filed in the 50 states and the District of Columbia using demographic and medical characteristics. The raw data set holds 10868 observations (9880 once the "All causes" rows are removed on load, as shown below), each with the following 6 features:
# Row and column counts after the "All causes" rows were dropped on load.
dim(usa_deaths_states)
## [1] 9880 6
# Preview the first rows to check the column names and values.
head(usa_deaths_states)
## Year X113.Cause.Name
## 1 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 2 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 3 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 4 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 5 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 6 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## Cause.Name State Deaths Age.adjusted.Death.Rate
## 1 Unintentional injuries United States 169936 49.4
## 2 Unintentional injuries Alabama 2703 53.8
## 3 Unintentional injuries Alaska 436 63.7
## 4 Unintentional injuries Arizona 4184 56.2
## 5 Unintentional injuries Arkansas 1625 51.8
## 6 Unintentional injuries California 13840 33.2
Spanish data set
The second data set details the causes of death in Spain for the year 2017. It was found on the government website, https://datos.gob.es/, although, since the original project proposal, it is no longer available at that location. This data was collected as part of a study done by the Spanish Institute of Statistics (INE). Due to the layout of the data, it had to be transformed before it could be processed. During this process the list of disease types was reduced by combining multiple entries of the same type into one, so that it could be compared to the USA data set. For example, the Spanish data set lists 30 types of cancer while the USA set has 1, so the Spanish figures were totaled under the name “Cancer”. The resulting data consists of 1056 observations with the following 4 features:
* **DISEASE**: Name of cause of death
* **GENDER**: The gender of the people represented by the observation; one of Males, Females, or Both (total of both males and females)
* **AGE**: The age range of the people who died in the observation
* **NUMBER.OF.DEATHS**: The number of people that died
# Row and column counts of the Spanish data set.
dim(spain_deaths)
## [1] 1056 4
# Preview the first rows to check the column names and values.
head(spain_deaths)
## DISEASE GENDER AGE NUMBER.OF.DEATHS
## 1 All causes Both All ages 424523
## 2 All causes Males All ages 214236
## 3 All causes Females All ages 210287
## 4 All causes Both 0 to 1 1092
## 5 All causes Males 0 to 1 619
## 6 All causes Females 0 to 1 473
As we can see, the two countries' data sets are structured differently, so the comparison will be tough. We will first restrict the US data set to the country-level rows, filtering out the per-state data, in order to view the country as a whole.
# Keep only the national aggregate rows (State == "United States") so the
# country can be analysed as a whole, then preview the result.
usa_deaths <- filter(usa_deaths_states, State == "United States")
head(usa_deaths)
## Year X113.Cause.Name
## 1 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 2 2017 Alzheimer's disease (G30)
## 3 2017 Cerebrovascular diseases (I60-I69)
## 4 2017 Chronic lower respiratory diseases (J40-J47)
## 5 2017 Diabetes mellitus (E10-E14)
## 6 2017 Diseases of heart (I00-I09,I11,I13,I20-I51)
## Cause.Name State Deaths Age.adjusted.Death.Rate
## 1 Unintentional injuries United States 169936 49.4
## 2 Alzheimer's disease United States 121404 31.0
## 3 Stroke United States 146383 37.6
## 4 CLRD United States 160201 40.9
## 5 Diabetes United States 83564 21.5
## 6 Heart disease United States 647457 165.0
# Stacked area chart of the age-adjusted death rate per cause over the years.
area_plot <- ggplot(
  usa_deaths,
  aes(x = Year, y = Age.adjusted.Death.Rate, fill = Cause.Name)
) +
  geom_area() +
  # The causes are mapped to `fill`, so the palette has to be set on the fill
  # scale; a colour scale has no effect on this plot.
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Trend of death causes",
    x = "",
    y = "Death Rate / 100.000 (Age Adjusted)",
    fill = "Causes"
  ) +
  theme_minimal() +
  # theme_minimal() is a complete theme that replaces earlier theme settings,
  # so the title centring must be applied after it to take effect.
  theme(plot.title = element_text(hjust = 0.5))
# Line chart of the (log) number of deaths per cause over the years.
line_plot <- ggplot(
  usa_deaths,
  aes(x = Year, y = log(Deaths), color = Cause.Name)
) +
  geom_line() +
  scale_color_brewer(palette = "Paired") +
  labs(
    title = "Changes on death causes over the years",
    x = "",
    # The plotted quantity is log(Deaths), not a death rate, so the axis label
    # names it accordingly.
    y = "log(Deaths)",
    # The causes are mapped to `color`; the legend title must be set on that
    # aesthetic (a `fill` title would be ignored).
    color = "Causes"
  ) +
  theme_minimal() +
  # theme_minimal() is a complete theme that replaces earlier theme settings,
  # so the title centring must be applied after it to take effect.
  theme(plot.title = element_text(hjust = 0.5))
# Render both trend charts as interactive plotly widgets.
area_plot %>% ggplotly()
line_plot %>% ggplotly()
# Top 10 causes of death in Spain: totals for both genders across all ages,
# leaving out the "All causes" and "Other causes" aggregate rows.
topTenSpanishDiseases <- spain_deaths %>%
  filter(
    GENDER == "Both",
    AGE == "All ages",
    DISEASE != "All causes",
    DISEASE != "Other causes"
  ) %>%
  top_n(10, NUMBER.OF.DEATHS)
# Tree map of the leading causes in Spain; tile area and tile label both show
# the number of deaths.
ggplot(
  topTenSpanishDiseases,
  aes(area = NUMBER.OF.DEATHS, fill = DISEASE, label = NUMBER.OF.DEATHS)
) +
  geom_treemap() +
  geom_treemap_text(
    colour = "white",
    fontface = "italic",
    place = "centre",
    reflow = TRUE,
    grow = FALSE
  ) +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "Most deadly diseases in Spain (2017)", fill = "Causes")
# Per-gender totals (all ages) for the causes that made the top-ten list.
topTenSpanishDiseasesByGenre <- spain_deaths %>%
  filter(
    GENDER != "Both",
    DISEASE != "All causes",
    AGE == "All ages",
    DISEASE %in% topTenSpanishDiseases$DISEASE
  )
# Re-level DISEASE by its death counts (reorder() uses the mean per level by
# default) so the stacked bars are ordered by size.
topTenSpanishDiseasesByGenre$DISEASE <- reorder(
  topTenSpanishDiseasesByGenre$DISEASE,
  topTenSpanishDiseasesByGenre$NUMBER.OF.DEATHS
)
# Stacked bars of deaths per gender, split by cause.
barplot <- ggplot(
  topTenSpanishDiseasesByGenre,
  aes(x = GENDER, y = NUMBER.OF.DEATHS, fill = DISEASE)
) +
  geom_col(position = "stack", width = 0.4) +
  scale_fill_brewer(palette = "Paired") +
  labs(x = "", y = "Number of deaths (2017)") +
  theme_minimal()
# Static version.
barplot
# Interactive version with the plotly bar gap set to 0.1.
barplot %>%
  ggplotly() %>%
  layout(bargap = 0.1) # messes with the legend
# Same stacked chart, redrawn with black bar outlines and slimmer bars.
barplot <- ggplot(
  data = topTenSpanishDiseasesByGenre,
  mapping = aes(x = GENDER, y = NUMBER.OF.DEATHS, fill = DISEASE)
) +
  geom_col(position = "stack", width = 0.3, colour = "black") +
  labs(x = "", y = "Number of deaths (2017)") +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal()
# Static version.
barplot
# Interactive version; the legend layout is passed an empty settings list.
barplot %>%
  ggplotly() %>%
  layout(legend = list()) # messes with the legend
# Build the data for the circulatory-system pyramid plot from the Spanish set.
diseasesCircSystemData <- spain_deaths
# Give AGE an explicit level order (youngest to oldest, "All ages" last) so the
# age bands are not sorted alphabetically on the plot axis.
diseasesCircSystemData$AGE <- factor(
  diseasesCircSystemData$AGE,
  levels = c(
    "0 to 1", "1 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24",
    "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54",
    "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84",
    "85 to 89", "90 to 94", "95 or more", "All ages"
  )
)
# Keep only circulatory-system deaths, per gender and per age band, then drop
# the now-constant DISEASE column. select() is the dedicated verb for removing
# a column; mapping a column to NULL through purrr::modify_at() relies on
# list-assignment side effects rather than a documented contract.
diseasesCircSystemData <- diseasesCircSystemData %>%
  filter(
    DISEASE == "Diseases of the circulatory system",
    GENDER != "Both",
    AGE != "All ages"
  ) %>%
  dplyr::select(-DISEASE)
# Quick sanity check of the remaining columns.
summary(diseasesCircSystemData)
## GENDER AGE NUMBER.OF.DEATHS
## Both : 0 0 to 1 : 2 Min. : 0.0
## Females:21 1 to 4 : 2 1st Qu.: 14.5
## Males :21 5 to 9 : 2 Median : 348.5
## 10 to 14: 2 Mean : 2274.5
## 15 to 19: 2 3rd Qu.: 2880.8
## 20 to 24: 2 Max. :13648.0
## (Other) :30
# Flip the sign of the male counts so their bars extend to the negative side
# of the axis in the pyramid plot; all other rows keep their positive counts.
diseasesCircSystemData <- diseasesCircSystemData %>%
  mutate(
    NUMBER.OF.DEATHS = NUMBER.OF.DEATHS * ifelse(GENDER == "Males", -1, 1)
  )
# Population-pyramid style chart: female counts are positive, male counts are
# negative (see the sign flip above), and the axes are flipped so age bands
# run vertically.
ggplot(
  diseasesCircSystemData,
  aes(x = AGE, y = NUMBER.OF.DEATHS, fill = GENDER)
) +
  geom_bar(
    data = subset(diseasesCircSystemData, GENDER == "Females"),
    stat = "identity"
  ) +
  geom_bar(
    data = subset(diseasesCircSystemData, GENDER == "Males"),
    stat = "identity"
  ) +
  # Breaks every 5,000 deaths; labels show the magnitude in thousands so both
  # sides of the pyramid read as positive numbers.
  scale_y_continuous(
    breaks = seq(-15000, 15000, 5000),
    labels = as.character(abs(seq(-15, 15, 5)))
  ) +
  coord_flip(ylim = c(-15000, 15000)) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Number of people who die from Diseases of the circulatory system",
    x = "Age",
    y = "Number of deaths (1,000)",
    fill = "Gender"
  ) +
  theme_bw()
# List the distinct State values; note that the "United States" aggregate is
# stored alongside the individual states and the District of Columbia.
unique(usa_deaths_states$State)
## [1] United States Alabama Alaska
## [4] Arizona Arkansas California
## [7] Colorado Connecticut Delaware
## [10] District of Columbia Florida Georgia
## [13] Hawaii Idaho Illinois
## [16] Indiana Iowa Kansas
## [19] Kentucky Louisiana Maine
## [22] Maryland Massachusetts Michigan
## [25] Minnesota Mississippi Missouri
## [28] Montana Nebraska Nevada
## [31] New Hampshire New Jersey New Mexico
## [34] New York North Carolina North Dakota
## [37] Ohio Oklahoma Oregon
## [40] Pennsylvania Rhode Island South Carolina
## [43] South Dakota Tennessee Texas
## [46] Utah Vermont Virginia
## [49] Washington West Virginia Wisconsin
## [52] Wyoming
## 52 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming
# List the distinct causes still present; "All causes" was filtered out on
# load and survives only as an unused factor level.
unique(usa_deaths_states$Cause.Name)
## [1] Unintentional injuries Alzheimer's disease
## [3] Stroke CLRD
## [5] Diabetes Heart disease
## [7] Influenza and pneumonia Suicide
## [9] Cancer Kidney disease
## 11 Levels: All causes Alzheimer's disease Cancer CLRD ... Unintentional injuries
# Prepare the per-state 2017 data behind the suicide choropleth maps.
# Start from the state-level rows only (the "United States" aggregate is
# excluded).
data <- usa_deaths_states %>%
  filter(Year == 2017, State != "United States")
# Attach each state's total deaths summed over all listed causes.
# aggregate() names the summed column `x`; the map code uses it as the
# denominator of the suicide share.
data <- left_join(
  data,
  aggregate(data$Deaths, by = list(State = data$State), FUN = sum),
  by = "State"
)
# Year and State were already restricted above, so only the cause filter is
# still needed here.
data <- data %>% filter(Cause.Name == "Suicide")
# map_data("state") identifies states by lower-case name in `region`.
data$region <- tolower(data$State)
states <- map_data("state")
# Left join from the polygons so every map outline keeps its rows even when
# no matching statistics row exists.
suicide_map <- left_join(states, data, by = "region")
# Choropleth of suicide deaths as a share of each state's summed deaths
# (Deaths / x, where x is the per-state total joined into suicide_map).
death_map <- ggplot(data = suicide_map) +
  geom_polygon(
    aes(x = long, y = lat, fill = Deaths / x, group = group),
    color = "white"
  ) +
  coord_fixed(1.3) +
  labs(x = "", y = "") +
  theme_void() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "white")
  ) +
  scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
  # Leave off the colour legend. "none" is the supported spelling; passing
  # FALSE is deprecated in recent ggplot2 releases.
  guides(fill = "none")
# Choropleth of the age-adjusted suicide death rate per state.
death_adjusted_map <- ggplot(data = suicide_map) +
  geom_polygon(
    aes(x = long, y = lat, fill = Age.adjusted.Death.Rate, group = group),
    color = "white"
  ) +
  coord_fixed(1.3) +
  labs(x = "", y = "") +
  theme_void() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "white")
  ) +
  scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
  # Leave off the colour legend. "none" is the supported spelling; passing
  # FALSE is deprecated in recent ggplot2 releases.
  guides(fill = "none")
# Render both choropleths as interactive plotly widgets.
death_map %>% ggplotly()
death_adjusted_map %>% ggplotly()